This script is used to produce the analyses of the paper. Our data is preprocessed in the data_processing.Rmd file.

Last edit: 22/05/2024

##Setup

Table 1 and Supplementary Table 1: barcodes

Information about the barcodes used with Sper01 marker.

extraction <- read.csv("data/extraction.csv")

barcodes <- read_excel("data/new_plant_positive_control.xlsx")[,c(2,4:6)]
## New names:
## • `` -> `...1`
names(barcodes) <- c("species_name", "sequence", "length", "gc_content")

barcodes[barcodes$species_name ==
            "Salvia_pratensis",]$sequence <-
  "atcctgttttctcaaaacaaaggttcaaaaaacgaaaaaaaaaaag"

barcodes <- barcodes %>% filter(species_name != "Rumex_acetosa")
#Species not used in this experiment
#Information about barcodes and extracted concentrations
barcodes %>% left_join(extraction[,-2])
## Joining with `by = join_by(species_name)`
## # A tibble: 30 × 5
##    species_name       sequence                     length gc_content CDNA_sample
##    <chr>              <chr>                         <dbl>      <dbl>       <dbl>
##  1 Taxus_baccata      atccgtattataggaacaataatttta…     41       24.4        7.94
##  2 Taxus_baccata      atccgtattataggaacaataatttta…     41       24.4       13.3 
##  3 Salvia_pratensis   atcctgttttctcaaaacaaaggttca…     45       26.7       20.8 
##  4 Salvia_pratensis   atcctgttttctcaaaacaaaggttca…     45       26.7       24.4 
##  5 Populus_tremula    atcctatttttcgaaaacaaacaaaaa…     68       25         33   
##  6 Populus_tremula    atcctatttttcgaaaacaaacaaaaa…     68       25         31.4 
##  7 Carpinus_betulus   atcctgttttcccaaaacaaataaaac…     61       27.9       13.1 
##  8 Carpinus_betulus   atcctgttttcccaaaacaaataaaac…     61       27.9        9.14
##  9 Fraxinus_excelsior atcctgttttcccaaaacaaaggttca…     39       33.3        6.56
## 10 Fraxinus_excelsior atcctgttttcccaaaacaaaggttca…     39       33.3       22.4 
## # ℹ 20 more rows

Estimation of K from PCR composition

https://assets.fishersci.com/TFS-Assets/LSG/brochures/AmpliTaq-Gold-360-DNA-Polymerase-QRC-061444.pdf

Molar mass of 1 dNTP: \(M=327g/mol\) in average (not needed)

https://www.thermofisher.com/fr/fr/home/references/ambion-tech-support/rna-tools-and-calculators/dna-and-rna-molecular-weights-and-conversions.html

Molar concentration of the 4 dNTP in well: \(c_n=1 mM = 1.10^{-3}mol/l\)

https://www.thermofisher.com/fr/fr/home/life-science/cloning/cloning-learning-center/invitrogen-school-of-molecular-biology/pcr-education/pcr-reagents-enzymes/pcr-component-considerations.html

Quantity of primers in well: \(n_p= 20pmol\)/PCR ?

Volume of well: \(V = 20 \mu l\)

Constant of Avogadro \(N_A = 6.02214076\times 10^{23} \: mol^{-1}\)

Average length of a sequence: \(L=51\) dNTP

Thus, \(K=\frac{min(c_n\times V, \: n_p)}{L}\times N_A\)

This is the maximum number of molecules that can be created in a well. Around 1e14.

c_n <- 4*0.2e-3 #concentration of dNTP, mol/l (4*0.2mM)
Vwell <- 20e-6 #volume of a well, l
N_A <- 6.02214076e23 #Avogadro
L <- barcodes$sequence %>% nchar %>% mean
#mean length of a sequence, number of nucleotides

n_p <- 2*5e-6*2e-6 #quantity of primers, mol : 2 (F+R) * 5uM * 2ul 

Ksup <- min(c_n*Vwell/L, n_p)*N_A
Ksup <- n_p*N_A

K <- Ksup

Figure 1: PCR models

Fitting exponential and logistic PCR models to qPCR data (SybrGreen) of Capsella bursa-pastoris. Data and script are described in Supplementary File 1.

qPCR data

Converting RFU to number of molecules for illustrative purpose (scaling).

#Open file containing qPCR data
ampli <- read.csv("data/20210712_101200_CT030043_SPER01_ANALYSE -  Quantification Amplification Results_SYBR.csv")[,-1]

ampli <- ampli %>% pivot_longer(2:ncol(ampli))
colnames(ampli)[2:3]<- c("Well","RFU")
ampli <- ampli[ampli$Well == "B7",] #B7: species Cbp, sample 26

#Convert RFU to molecules
slope <- K/(max(ampli$RFU)-min(ampli$RFU))
intercept <- -slope*min(ampli$RFU)
ampli$molecules <- slope * ampli$RFU + intercept

data_ampli <- as.matrix(ampli[,c("Cycle", "RFU")])
data_ampli[,1] <- data_ampli[,1] - 1

ncycles <- nrow(ampli)

Fitting PCR models

Parameters are fitted numerically. No biological consideration here. The function (pcrfit) and the data structure used come from R package qpcR.

Non-zero low weights are given for numerical reasons.

mexp <- cm3 #Reuse model object from package qPCR

#Define model

mexp$expr <- "Fluo ~ mexp$fct(Cycles, c(M0, Lam))"
mexp$fct <- function(x, parm) {
  M0 <- parm[1]
  Lam <- parm[2]
  Fn <- vector(mode = "numeric", length = length(x))
  for (i in 1:length(x)) {
    if (i == 1) Fn[i] <- M0
    else {
      Fn[i] <- Fn[i - 1] * (1 + Lam*(1-Fn[i - 1]/K))
    }
  }
  return(Fn)
}
mexp$ssFct <- function(x, y) {
  ## start estimates
  M0 <- 1
  Lam <- 1
  ssVal <- c(M0, Lam)
  names(ssVal) <- mexp$parnames
  return(ssVal)
}
mexp$inv <- function(y, parm) {
  x <- 1:100
  fn <- function(x, parm) mexp$fct(x, parm) - y
  uniroot(fn, interval = c(1, 100), parm)$root
}
mexp$expr.grad <- expression(mexp$fct(Cycles, c(M0, Lam)))
mexp$parnames <- c("M0", "Lam")
mexp$name <- "mexp"
mexp$type <- "my exponential model"


#Weight cycles

weights_exp <- c(rep(0, 5), rep(1, 9), rep(1000, 6), rep(0, 41))

fitexp <- pcrfit(data_ampli, 1, 2, mexp,
                 weights = weights_exp)

res_exp <- slope*fitexp$m$predict(newdata = fitexp$DATA) + intercept
mlog <- cm3 #Reuse model object from package qPCR

#Define model

mlog$expr <- "Fluo ~ mlog$fct(Cycles, c(M0, Lam))"
mlog$fct <- function(x, parm) {
  M0 <- parm[1]
  Lam <- parm[2]
  K <- max(ampli$RFU) #specific to this amplification
  Fn <- vector(mode = "numeric", length = length(x))
  for (i in 1:length(x)) {
    if (i == 1) Fn[i] <- M0
    else {
      if (Fn[i - 1] < K){Fn[i] <- Fn[i - 1] * (1 + Lam*(1-Fn[i - 1]/K)) }
      else {Fn[i] <- Fn[i - 1]}
    }
  }
  return(Fn)
}
mlog$ssFct <- function(x, y) {
  ## start estimates
  M0 <- 1
  Lam <- 1
  K <- max(ampli$RFU)
  ssVal <- c(M0, Lam)
  names(ssVal) <- mlog$parnames
  return(ssVal)
}
mlog$inv <- function(y, parm) {
  x <- 1:100
  fn <- function(x, parm) mlog$fct(x, parm) - y
  uniroot(fn, interval = c(1, 100), parm)$root
}
mlog$expr.grad <- expression(mlog$fct(Cycles, c(M0, Lam)))
mlog$parnames <- c("M0", "Lam")
mlog$name <- "mlog"
mlog$type <- "my logistic model"


#Weight cycles

weights_log <- c(rep(0, 5), rep(1, 9), rep(1000, 10), rep(1, 15), rep(0, 22))

fitlog <- pcrfit(data_ampli, 1, 2, mlog,
                 weights = weights_log)

res_log <- slope*fitlog$m$predict(newdata = fitlog$DATA) + intercept

Figure 1: Fitted PCR models

ggplot()+
        geom_point(data = ampli, aes(Cycle-1, molecules))+
        geom_line(data = NULL, aes(0:60, res_log,
                                   col = "Logistic"),
                  linewidth = 0.75)+
        geom_point(data = ampli, aes(Cycle-1, molecules))+
        geom_line(data = NULL, aes(0:60, res_exp,
                                   col = "Exponential"),
                  linewidth = 0.75)+
        labs(x = "Cycle",
             y = "Estimated number of molecules", 
             col = "Amplification model")+
        theme(legend.justification = c(1, 0),
              legend.position = c(1, 0),
        legend.box.margin=margin(rep(10, 4)))+
  scale_colour_brewer(palette = "Set1")+
    # scale_y_log10()+
  coord_cartesian(xlim = c(0, 50), ylim = c(1, K))

#+
        # ggtitle(
        #   paste0("Real qPCR data and saturation models - Species : ",
        #                "Capsella bursa-pastoris"))+
        #   theme(plot.title = element_text(hjust = 0.5))

# ggsave(filename = "Figures/saturation_models2.pdf",
#        dpi = 600, units = "mm",
#        width = 150, height = 100, device = "pdf")

Digital droplet PCR (ddPCR) assays

ddPCR data

Dataframe summary_experiment contains all info concerning the concentration assays.

#QuantaSoft output file with additional data
dataddpcr <- read_csv("data/dataddpcr.csv")
## Rows: 85 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Well, Status, sp
## dbl (8): Concentration, Negatives, AcceptedDroplets, Sample, Dilu, plate, To...
## lgl (1): Used
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Used columns:
# Sample: Number of the plant sample
# Concentration: Assayed copies/ul in well
# sp: species
# TotalDNA: Total DNA concentration in ddPCR wells, ng/ul

#Open extraction data
#Used columns:
# species name
# Sample: Number of the plant sample
# CDNA_sample: Concentration of total DNA in the sample assayed by Qubit, ng/ul

Vdna_dd <- 5 #Volume of DNA in a well in ul
Vmix_dd <- 20 #Volume of mix in a well in ul

#summary_experiment contains all info concerning the concentration assays
#Molecules_ng is the number of target molecules / ng of total DNA
summary_experiment <- dataddpcr %>% group_by(Sample) %>%
  summarise(sp = sp[1],
            Molecules_ng = mean(Concentration/TotalDNA)) %>%
  left_join(extraction) %>%
  dplyr::select(Sample, species_name, sp, Molecules_ng, CDNA_sample)
## Joining with `by = join_by(Sample)`

Metabarcoding preparation

DNA concentration in the different communities

Uniform community (M_U)

CDNA_U <- 0.5
#Total DNA concentration in ng/ul for all plants
#in the U community (concentration in sample, not in pcr mix)
C_species_ref <- CDNA_U/13 #concentration of each species, ng/ul

summary_experiment <- summary_experiment %>%
  mutate(Volume_equi = max(Molecules_ng)/Molecules_ng,
         cDNA_U = C_species_ref*Volume_equi,
         #Concentration of total DNA to use in the U commu
         molecules_U = cDNA_U*Molecules_ng, #Molecules/ul in sample
         propU = molecules_U/sum(molecules_U), #species proportion
         Volume_equi = max(Molecules_ng)/Molecules_ng,
         #Volume to take to be as concentrated as 1 ul of Ptr 6 (most concentrated)
          RankG = 13 - rank(CDNA_sample/Volume_equi) + 1)

Number of molecules in the M_U community

VDNA_metab <- 2 #Volume of DNA used in the mix, ul
VDNA_metab * summary_experiment$molecules_U #Molecules of each species in mix
##  [1] 19084.31 19084.31 19084.31 19084.31 19084.31 19084.31 19084.31 19084.31
##  [9] 19084.31 19084.31 19084.31 19084.31 19084.31
VDNA_metab * sum(summary_experiment$molecules_U) #Total number of molecules
## [1] 248096

“Uniform in Total DNA” Communauty (M_T)

Number of molecules in the M_T community

summary_experiment <- summary_experiment %>%
  mutate(molecules_T = C_species_ref*Molecules_ng, #molecules/ul of sample
         propT = molecules_T/sum(molecules_T)) #species proportions

VDNA_metab * summary_experiment$molecules_T #Molecules of each species in mix
##  [1] 11760.000 19084.308  4051.282  6695.385  5672.308  6896.410 10360.237
##  [8]  9659.487 16280.000  2964.103  6330.256  2883.282 10806.154
VDNA_metab * sum(summary_experiment$molecules_T) #total number of molecules
## [1] 113443.2

Geometric community (M_G)

Number of molecules in the M_G community (2-fold dilution)

summary_experiment <- summary_experiment %>%
  mutate(Volume_equi = max(Molecules_ng)/Molecules_ng,
         cDNA_G = CDNA_U/2^RankG*Volume_equi,
         molecules_G = cDNA_G*Molecules_ng,
         propG = molecules_G/sum(molecules_G))

summary_experiment$molecules_G * VDNA_metab #Molecules of each species in mix
##  [1]   7753.00000  15506.00000    121.14063   1938.25000   3876.50000
##  [6]     60.57031    969.12500 124048.00000  62024.00000    484.56250
## [11]    242.28125     30.28516  31012.00000
sum(summary_experiment$molecules_G) * VDNA_metab #Total number of molecules
## [1] 248065.7

Figure 3: ddPCR assays

Target DNA concentrations of each sample

data_dd_fig <- dataddpcr %>%
  left_join(summary_experiment %>% dplyr::select(sp, species_name, RankG))
## Joining with `by = join_by(sp)`
order_short <- summary_experiment %>% dplyr::select(sp, RankG) %>%
  arrange(-RankG) %>% pull(sp)

highlight <- function(x, pat, color="black", family="") {
  ifelse(grepl(pat, x), glue("<b style='font-family:{family}; color:{color}'>{x}</b>"), x)
}

ggplot(data_dd_fig)+
  geom_point(aes(factor(sp, levels = order_short, ordered = T),
                 Concentration_copies_ng/1000,
                 shape = as.factor(TotalDNA)))+
  geom_point(data = summary_experiment,
             aes(as.factor(sp), Molecules_ng/1000),
             fill = "red", shape = 23, size = 3)+
  # ggtitle("Target DNA molecules at given Total DNA concentration")+
  labs(x = "Plant species",
       y = "Thousands of molecules per ng of total DNA\n",
       shape = TeX("Concentration (ng/$\\mu l$)"))+
  scale_x_discrete(labels= function(x) highlight(x, "Cb|Fe"))+
  # theme_bw()+
  theme(axis.text.x=element_markdown(),
        legend.title.align=0.5,
        legend.text.align = 0.5)

# ggsave(filename = "Figures/ddpcr_fig_clean.pdf",
#        dpi = 600, units = "mm",
#        width = 150, height = 100, device = "pdf")

# ggsave(filename = "Figures/ddpcr_fig_clean.png",
#        dpi = 600, units = "mm",
#        width = 150, height = 90)

Statistics from ddPCR assays:

Species Populus tremula and Rhododendron ferrugineum

data_dd_fig %>% filter(sp == "Ptr") %>%
  pull(Concentration_copies_ng) %>% summary
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  236800  240000  251200  248096  255520  256960
data_dd_fig %>% filter(sp == "Ptr") %>%
  pull(Concentration_copies_ng) %>% sd
## [1] 9171.504
data_dd_fig %>% filter(sp == "Rfe") %>%
  pull(Concentration_copies_ng) %>% summary
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   33152   33680   34912   37483   37176   50720
data_dd_fig %>% filter(sp == "Rfe") %>%
  pull(Concentration_copies_ng) %>% sd
## [1] 6695.412
summary_experiment$Molecules_ng %>% summary
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   37483   73740   89653  113443  140480  248096

Metabarcoding results

Pipeline from raw fastq data: see data_processing.Rmd file (we used the obitools softwares).

Open data, filter bad PCR replicates:

df_Sper01 <- read_csv("data/df_Sper01.csv")
## Rows: 74708 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Sample, id, sequence, Community, species_name
## dbl (8): obiclean_weight, reads, Spike_level, total_count_sample, count, seq...
## lgl (1): NegControl
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
summary_sample <- df_Sper01 %>%
  group_by(Sample) %>%
  summarise(total_count_sample = sum(reads),
            NegControl = NegControl[1],
            Spike_level = Spike_level[1],
            Community = Community[1]) %>%
  arrange(total_count_sample)

summary(summary_sample$total_count_sample)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     414   12247   31505   35305   54730  108166
summary(summary_sample %>%
          filter(!NegControl) %>% pull(total_count_sample))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1068   13952   33712   37023   55612  108166
sd(summary_sample %>%
          filter(!NegControl) %>% pull(total_count_sample))
## [1] 26763.09
reads_threshold <- 5000 #Replicates with less than 5000 reads are removed

ggplot(summary_sample %>%
          filter(!NegControl))+
  geom_point(aes(x = 1:nrow(summary_sample %>%
          filter(!NegControl)),
                 y = total_count_sample,
                 shape = Community))+
  labs(col = "Spike_level")+
  scale_y_log10()+
  geom_hline(yintercept = reads_threshold)

Data preparation

Observed and expected proportions in final composition.

Community M_U

df_targetU <- df_Sper01 %>%
  filter(Community == "U" &
           species_name %in% barcodes$species_name &
                          total_count_sample > reads_threshold &
                          !NegControl) %>%
  mutate(prop = obiclean_weight/total_count_sample) %>%
  left_join(summary_experiment %>% dplyr::select(species_name, sp, propU)) %>%
  rename(prop_expected = propU)
## Joining with `by = join_by(species_name)`

Community M_T

df_targetT <- df_Sper01 %>%
  filter(Community == "T" &
           species_name %in% barcodes$species_name &
           total_count_sample > reads_threshold &
           !NegControl) %>%
  mutate(prop = obiclean_weight/total_count_sample) %>%
  left_join(summary_experiment %>% dplyr::select(species_name, sp, propT)) %>%
  rename(prop_expected = propT)
## Joining with `by = join_by(species_name)`

Community M_G

df_targetG <- df_Sper01 %>%
  filter(Community == "G" &
           species_name %in% barcodes$species_name &
           total_count_sample > reads_threshold &
           !NegControl) %>%
  mutate(prop = obiclean_weight/total_count_sample) %>%
  left_join(summary_experiment %>% dplyr::select(species_name, sp, propG)) %>%
  rename(prop_expected = propG)
## Joining with `by = join_by(species_name)`

Figure 4: observed and expected proportions

colp <- c("Expected" = "gold",
          "Expected\nwithout\nPCR bias" = "gold",
          "Expected\nwithout\nPCR and\nconcentration\nbiases" = "gold")
          #"Inferred\nwith model" = "darkolivegreen2")

gobsU1 <- ggplot(df_targetU)+
  geom_boxplot(aes(factor(sp, levels = order_short, ordered = T),
                   prop))+
      geom_point(data = summary_experiment,
             aes(sp, 1/nrow(summary_experiment),
                 fill = "Expected\nwithout\nPCR bias"),
             shape = 23,
             size = 3)+
    geom_hline(aes(yintercept = 1/nrow(summary_experiment),
                col = "Expected\nwithout\nPCR and\nconcentration\nbiases"),
             linetype = "dashed")+
  ggtitle(TeX("Uniform community (${{M}_U}$)"))+
  labs(y = "", x = "", fill = "", col = "")+
  ylim(min(df_targetT$prop), max(df_targetT$prop)) +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x=element_markdown(),
        legend.title.align=0.5)+
  scale_color_manual(values = colp)+
  scale_fill_manual(values = colp)+
  scale_x_discrete(labels= function(x) highlight(x, "Cb|Fe"))
  

gobsT1 <- ggplot(df_targetT)+
  geom_boxplot(aes(factor(sp, levels = order_short, ordered = T),
                   prop))+
    geom_point(data = summary_experiment,
             aes(sp, propT, fill = "Expected\nwithout\nPCR bias"),
             shape = 23,
             size = 3)+
  geom_hline(aes(yintercept = 1/nrow(summary_experiment),
                col = "Expected\nwithout\nPCR and\nconcentration\nbiases"),
             linetype = "dashed")+
  ggtitle(TeX("Uniform in Total DNA community (${{M}_T}$)"))+
  labs(y = "", x = "", fill = "", col = "")+
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x=element_markdown(),
        legend.title.align=0.5)+
  scale_color_manual(values = colp)+
  scale_fill_manual(values = colp)+
    scale_x_discrete(labels= function(x) highlight(x, "Cb|Fe"))


# gobsG1 <- ggplot(df_targetG)+
#   geom_boxplot(aes(factor(sp, levels = order_short, ordered = T),
#                    prop))+
#   labs(y = "", x = "")+
#   ggtitle("Geometric community (G)")+
#   theme(plot.title = element_text(hjust = 0.5))+
#   geom_point(data = summary_experiment,
#              aes(sp, propG), shape = 23,
#              fill = "gold", size = 3)+
#   scale_y_log10()

fig <- ggpubr::ggarrange(gobsU1, gobsT1, #gobsG1,
                  ncol = 1,
                  common.legend = T,
                  legend = "right")
annotate_figure(fig,
# top = text_grob("Final abundances in two mock communities with Sper01"),
                bottom = text_grob("Species"),
                left = text_grob("Observed reads proportions", rot = 90))

# ggsave("Figures/metabar_results.pdf", units = "mm",
#        width = 150, height = 200, dpi = 600, device = "pdf")

Examples of proportions: species Populus tremula

df_targetU %>% filter(sp == "Ptr") %>% pull(prop) %>% summary
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.03278 0.07092 0.08630 0.08618 0.10383 0.13607
df_targetU %>% filter(sp == "Ptr") %>% pull(prop) %>% sd
## [1] 0.02675534
df_targetU %>% group_by(sp) %>% summarise(s = sd(prop), m = mean(prop)) %>%arrange(m)
## # A tibble: 13 × 3
##    sp          s      m
##    <chr>   <dbl>  <dbl>
##  1 Gro   0.00462 0.0241
##  2 Spr   0.00849 0.0267
##  3 Lxy   0.00417 0.0550
##  4 Aal   0.0301  0.0579
##  5 Bme   0.00964 0.0608
##  6 Fex   0.00629 0.0626
##  7 Lco   0.00770 0.0796
##  8 Cbe   0.0122  0.0809
##  9 Aca   0.0207  0.0827
## 10 Ptr   0.0268  0.0862
## 11 Rfe   0.0154  0.101 
## 12 Cbp   0.0129  0.106 
## 13 Rca   0.00872 0.171
df_targetG %>% group_by(sp) %>% summarise(s = sd(prop), m = mean(prop)) %>%arrange(m)
## # A tibble: 13 × 3
##    sp            s         m
##    <chr>     <dbl>     <dbl>
##  1 Aal   0.0000299 0.0000509
##  2 Rfe   0.000123  0.000177 
##  3 Gro   0.000141  0.000228 
##  4 Cbe   0.000131  0.000312 
##  5 Aca   0.000473  0.00166  
##  6 Cbp   0.000485  0.00189  
##  7 Spr   0.00206   0.00625  
##  8 Fex   0.00150   0.00676  
##  9 Lxy   0.00181   0.00962  
## 10 Ptr   0.0228    0.0551   
## 11 Lco   0.00849   0.148    
## 12 Bme   0.0386    0.366    
## 13 Rca   0.0136    0.400

Ratio correction from a reference mock community (M_U)

This approach considers that the factor bias is independent to the whole community composition.

Correction factor computed from community M_U is:

a_s = prop_obs(s)/prop_init(s) (* constant value)

then prop_infer_mock(s) = prop_obs(s)/a_s(M_U)

where prop_infer_mock are the inferred proportions established by this ratio correction.

order_sp <- summary_experiment %>% dplyr::select(species_name, RankG) %>%
  arrange(-RankG) %>% pull(species_name)

correcU <- df_targetU %>%
  group_by(species_name) %>%
  summarise(correc = median(prop)/prop_expected[1], sp = sp[1])

#M_U
df_targetU <- df_targetU %>% left_join(correcU)
## Joining with `by = join_by(species_name, sp)`
df_targetU$prop_infer_mock <- df_targetU$prop/df_targetU$correc
df_targetU <- df_targetU %>%
  group_by(Sample) %>%
  mutate(prop_infer_mock = prop_infer_mock/sum(prop_infer_mock,
                                               na.rm = T))

df_targetU$species_name <- factor(df_targetU$species_name,
                                  levels = order_sp,
                                  ordered = T)

#M_T
df_targetT <- df_targetT %>% left_join(correcU)
## Joining with `by = join_by(species_name, sp)`
df_targetT$prop_infer_mock <- df_targetT$prop/df_targetT$correc
df_targetT <- df_targetT %>%
  group_by(Sample) %>%
  mutate(prop_infer_mock = prop_infer_mock/sum(prop_infer_mock,
                                               na.rm = T))
df_targetT$species_name <- factor(df_targetT$species_name,
                                  levels = order_sp,
                                  ordered = T)

#M_G
df_targetG <- df_targetG %>% left_join(correcU)
## Joining with `by = join_by(species_name, sp)`
df_targetG$prop_infer_mock <- df_targetG$prop/df_targetG$correc
df_targetG <- df_targetG %>%
  group_by(Sample) %>%
  mutate(prop_infer_mock = prop_infer_mock/sum(prop_infer_mock,
                                               na.rm = T))

df_targetG$species_name <- factor(df_targetG$species_name,
                                  levels = order_sp,
                                  ordered = T)
# ginferT1 <- ggplot(df_targetT)+
#   geom_boxplot(aes(factor(sp, levels = order_short, ordered = T),
#                    prop_infer_mock))+
#   geom_point(data = df_targetT %>% group_by(sp) %>%
#                summarise(prop = median(prop)),
#     aes(sp, prop, fill = "Observed\n(Median)"),
#              shape = 21, size = 3)+
#   theme(axis.text.x=element_blank())+
#   labs(y = "", x = "",
#        fill = "", col = "")+
#   geom_point(data = summary_experiment,
#              aes(sp, propT, fill = "Expected"), shape = 23,
#              size = 3)+
#   scale_fill_manual(values = colp)+
#   ggtitle("Uniform in Total DNA community (T)")+
#   theme(plot.title = element_text(hjust = 0.5))
# 
# ginferG1 <- ggplot(df_targetG)+
#   geom_boxplot(aes(factor(sp, levels = order_short, ordered = T),
#                    prop_infer_mock))+
#   geom_point(data = df_targetG %>% group_by(sp) %>%
#                summarise(prop = median(prop)),
#              aes(sp, prop, fill = "Observed\n(Median)"),
#              shape = 21, size = 3)+
#   labs(y = "", x = "",
#        fill = "", col = "")+
#   geom_point(data = summary_experiment,
#              aes(sp, propG, fill = "Expected"), shape = 23,
#              size = 3)+
#   scale_fill_manual(values = colp)+
#   ggtitle("Geometric community (G)")+
#   theme(plot.title = element_text(hjust = 0.5),
#         legend.position = "none")+
#   scale_y_log10()+
#   scale_x_discrete(labels= function(x) highlight(x, "Cb|Fe"))+
#   theme(axis.text.x=element_markdown())
# 
# fig <- egg::ggarrange(ginferT1, ginferG1,
#                       ncol = 1)
# annotate_figure(fig,
#                 top = text_grob("Corrected abundances in two mock communities with Sper01"),
#                 bottom = text_grob("Species"),
#                 left = text_grob("Inferred reads proportions", rot = 90))

# ggsave("Figures/metabar_results_corrected.pdf", units = "mm",
#        width = 180, height = 150, dpi = 600, device = "pdf")

Merge the results

resU1 <- df_targetU %>% group_by(species_name) %>%
  summarise(prop = median(prop), prop_expected = prop_expected[1],
            prop_infer_mock = median(prop_infer_mock))

resT1 <- df_targetT %>% group_by(species_name) %>%
  summarise(prop = median(prop), prop_expected = prop_expected[1],
            prop_infer_mock = median(prop_infer_mock))

resG1 <- df_targetG %>% group_by(species_name) %>%
  summarise(sp = sp[1], prop = median(prop), prop_expected = prop_expected[1],
            prop_infer_mock = median(prop_infer_mock))

Efficiencies inference

Export data to Julia

Export data to Julia to infer efficiencies and proportions with flimo

Files metabar_bias_functions.jl and inference_known_K.jl

readsU <- df_targetU %>%
  dplyr::select(Sample, obiclean_weight, species_name) %>%
  pivot_wider(names_from = species_name,
              values_from = obiclean_weight) %>% ungroup %>% dplyr::select(-Sample)

qty_initU <- left_join(data.frame(species_name = colnames(readsU)),
                       summary_experiment %>%
                         dplyr::select(species_name, molecules_U)) %>%
              pull(molecules_U)*2 #2ul
## Joining with `by = join_by(species_name)`
# write_csv(readsU, "data/reads_U.csv")
# write_csv(data.frame(qty_initU), "data/qty_initU.csv")

readsT <- df_targetT %>%
  dplyr::select(Sample, obiclean_weight, species_name) %>%
  pivot_wider(names_from = species_name,
              values_from = obiclean_weight) %>% ungroup %>% dplyr::select(-Sample)

qty_initT <- left_join(data.frame(species_name = colnames(readsU)),
                       summary_experiment %>%
                         dplyr::select(species_name, molecules_T)) %>%
              pull(molecules_T)*2 #2ul
## Joining with `by = join_by(species_name)`
# write_csv(readsT, "data/reads_T.csv")
# write_csv(data.frame(qty_initT), "data/qty_initT.csv")

readsG <- df_targetG %>%
  dplyr::select(Sample, obiclean_weight, species_name) %>%
  pivot_wider(names_from = species_name,
              values_from = obiclean_weight,
              values_fill = 0) %>% ungroup %>% dplyr::select(-Sample)

qty_initG <- left_join(data.frame(species_name = colnames(readsU)),
                       summary_experiment %>%
                         dplyr::select(species_name, molecules_G)) %>%
              pull(molecules_G) #2ul
## Joining with `by = join_by(species_name)`
# write_csv(readsG, "data/reads_G.csv")
# write_csv(data.frame(qty_initG), "data/qty_initG.csv")

Proportions corrected with PCR efficiencies

Proportions inferred in Julia using PCR efficiencies

Run metabar_bias_functions.jl (functions), inference_known_K.jl (inference of efficiencies from M_U and then proportions in M_T and M_G) and inference_taqman.jl (processing the taqman analysis)

Inference with the flimo method - reads in M_U used to infer efficiencies Lambda_s - then efficiencies used to infer proportions in M_T and M_G

Efficiencies

#Inferred efficiencies
efficiencies_infer <- cbind(1,
                            read_csv("data/efficiencies_U_K1.csv"))
## Rows: 10 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (12): Column1, Column2, Column3, Column4, Column5, Column6, Column7, Col...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames(efficiencies_infer) <- colnames(readsU)

apply(efficiencies_infer, 2, sd)
##              Rosa_canina              Briza_media       Lotus_corniculatus 
##             0.000000e+00             9.907679e-05             9.238455e-05 
##          Populus_tremula           Acer_campestre Rhododendron_ferrugineum 
##             1.793031e-04             1.558658e-04             1.741635e-04 
##  Capsella_bursa-pastoris       Fraxinus_excelsior         Carpinus_betulus 
##             1.192102e-04             1.402829e-04             1.809176e-04 
##       Lonicera_xylosteum               Abies_alba         Salvia_pratensis 
##             1.448291e-04             2.081637e-04             1.133038e-04 
##     Geranium_robertianum 
##             3.009810e-04
summary_experiment <- summary_experiment %>%
  left_join(efficiencies_infer %>%
              as_tibble() %>%
              pivot_longer(1:13, names_to = "species_name",
                           values_to = "efficiencies") %>%
              group_by(species_name) %>%
              summarise(efficiencies = mean(efficiencies)))
## Joining with `by = join_by(species_name)`

Inferred proportions in M_T and M_G

Loading inferred proportions computed in Julia

inferT <- read.csv("data/prop_inferT_K1.csv")
colnames(inferT) <- colnames(readsT)
resT1 <- resT1 %>%
  left_join(inferT %>%
              pivot_longer(everything(),
                           names_to = "species_name",
                           values_to = "prop_infer_Lambda") %>%
              group_by(species_name) %>%
              summarise(prop_infer_Lambda = mean(prop_infer_Lambda))) %>%
  arrange(-dplyr::row_number())
## Joining with `by = join_by(species_name)`
inferT %>%
              pivot_longer(everything(),
                           names_to = "species_name",
                           values_to = "prop_infer_Lambda") %>%
              group_by(species_name) %>%
              summarise(prop = mean(prop_infer_Lambda),
                        sd = sd(prop_infer_Lambda)/prop)
## # A tibble: 13 × 3
##    species_name               prop      sd
##    <chr>                     <dbl>   <dbl>
##  1 Abies_alba               0.0213 0.00639
##  2 Acer_campestre           0.0816 0.00415
##  3 Briza_media              0.0831 0.00243
##  4 Capsella_bursa-pastoris  0.0247 0.00867
##  5 Carpinus_betulus         0.0268 0.00437
##  6 Fraxinus_excelsior       0.0564 0.00215
##  7 Geranium_robertianum     0.0714 0.0238 
##  8 Lonicera_xylosteum       0.0389 0.00811
##  9 Lotus_corniculatus       0.116  0.00336
## 10 Populus_tremula          0.203  0.00486
## 11 Rhododendron_ferrugineum 0.0385 0.00899
## 12 Rosa_canina              0.123  0.0206 
## 13 Salvia_pratensis         0.116  0.0227
inferG <- read.csv("data/prop_inferG_K1.csv")
colnames(inferG) <- colnames(readsG)
resG1 <- resG1 %>%
  left_join(inferG %>%
              pivot_longer(everything(),
                           names_to = "species_name",
                           values_to = "prop_infer_Lambda") %>%
              group_by(species_name) %>%
              summarise(prop_infer_Lambda = mean(prop_infer_Lambda))) %>%
  arrange(-dplyr::row_number())
## Joining with `by = join_by(species_name)`
inferG %>%
              pivot_longer(everything(),
                           names_to = "species_name",
                           values_to = "prop_infer_Lambda") %>%
              group_by(species_name) %>%
              summarise(prop = mean(prop_infer_Lambda),
                        sd_mean = sd(prop_infer_Lambda)/prop)
## # A tibble: 13 × 3
##    species_name                  prop sd_mean
##    <chr>                        <dbl>   <dbl>
##  1 Abies_alba               0.0000489 0.0703 
##  2 Acer_campestre           0.00175   0.0103 
##  3 Briza_media              0.530     0.00645
##  4 Capsella_bursa-pastoris  0.00156   0.0126 
##  5 Carpinus_betulus         0.000336  0.0309 
##  6 Fraxinus_excelsior       0.00953   0.00715
##  7 Geranium_robertianum     0.000839  0.0475 
##  8 Lonicera_xylosteum       0.0155    0.0122 
##  9 Lotus_corniculatus       0.162     0.00131
## 10 Populus_tremula          0.0558    0.00247
## 11 Rhododendron_ferrugineum 0.000144  0.0342 
## 12 Rosa_canina              0.202     0.0208 
## 13 Salvia_pratensis         0.0210    0.0319

Taqman assay

In this section, efficiencies of 3 species are estimated from a Taqman qPCR assay.

Open data (prepared in data_processing.Rmd)

res_taqman <- read.csv("data/res_taqman.csv")
kinetics <- read.csv("data/taqman_kinetics.csv")

Plot amplification curves and Cq = f(concentration)

ggplot(kinetics)+
  geom_line(aes(Cycle, RFU, group = Well, col = probe))

ggplot(res_taqman)+
  geom_point(aes(copies_ul, Cq, group = Well, col = probe))+
  scale_x_log10()+
  labs(y="Ct")+
  ggtitle("Ct = f(log(Concentration))")

First step: linear regression

Vmix_taq <- 25 #ul

res_taqman$slope <- NA
res_taqman$intercept <- NA

for (s in unique(res_taqman$probe)){
  reg <- lm(Cq ~ log10(copies),
            res_taqman %>%
              filter(probe == s) %>%
              group_by(Dilu) %>%
              summarise(Cq = mean(Cq),
                        copies = Vmix_taq*copies_ul[1]))
  res_taqman[res_taqman$probe == s, "slope"] <- reg$coefficients[2]
  res_taqman[
    res_taqman$probe == s, "intercept"] <- reg$coefficients[1]
}

res_taqman$rate <- 10^(-1/res_taqman$slope)-1

res_taqman$Mct <- 10^(-res_taqman$intercept/res_taqman$slope)

res_taqman %>% group_by(probe) %>%
  summarise(rate = rate[1],
            Mct = Mct[1]/1e11)
## # A tibble: 4 × 3
##   probe  rate   Mct
##   <chr> <dbl> <dbl>
## 1 CbeA  0.985  1.76
## 2 CbeB  0.979  2.31
## 3 Cbp   0.968  1.08
## 4 Fex   0.930  1.26

Try to infer with only 2 out of the 3 replicates : too important variability

flam <- function(p){
  10^(-1/p)-1
}

res_lm_test <- NULL

for (s in unique(res_taqman$probe)){
  for (try in 1:100){
    reg <- lm(Cq ~ log10(copies),
              res_taqman %>%
                filter(probe == s & !Neg) %>%
                group_by(Dilu) %>%
                summarise(Cq = mean(Cq[sample(1:n(), 2)]),
                          copies = Vmix_taq*copies_ul[1]))
    res_lm_test <- rbind(res_lm_test,
                         data.frame(probe = s,
                                    slope = reg$coefficients[2]))
  }
}

res_lm_test %>% group_by(probe) %>% summarise(min = min(slope),
                                              max = max(slope)) %>%
  mutate(lmin = flam(min), lmax = flam(max), gap = lmax - lmin)
## # A tibble: 4 × 6
##   probe   min   max  lmin  lmax    gap
##   <chr> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 CbeA  -3.39 -3.31 0.972 1.01  0.0347
## 2 CbeB  -3.40 -3.34 0.970 0.991 0.0202
## 3 Cbp   -3.45 -3.36 0.950 0.986 0.0359
## 4 Fex   -3.57 -3.45 0.905 0.950 0.0447

Estimate constants of the model

PCR conditions for Taqman qPCR are different from those for metabarcoding: K is therefore recalculated (not used in the analysis).

#25ul/well:
#12.5ul qPCR mix : iTaq Universal Probes Supermix (Biorad)
#1.5ul primers F+R at 5uM each
#5.375ul H20
#0.625ul probes
#5ul DNA

c_n <- 4*0.2e-3 #concentration of dNTP, mol/l (4*0.2mM) ***à vérifier
Vtaq <- 25e-6 #volume of a well, l
N_A <- 6.02214076e23 #Avogadro
Ltaq <- barcodes %>% filter(species_name %in%
                              c("Carpinus_betulus",
                                "Capsella_bursa-pastoris",
                                "Fraxinus_excelsior")) %>%
  pull(sequence) %>% nchar %>% mean
#mean length of a sequence, number of nucleotides

n_p_taq <- 10e-6*1.5e-6 #quantity of primers, mol : 2 (F+R) * 10uM * 0.75ul 

Ktaq <- min(c_n*Vtaq/Ltaq, n_p_taq)*N_A
Ktaq <- n_p_taq*N_A
Ktaq
## [1] 9.033211e+12

RFU Threshold for each well

res_taqman$RFU_Cq <- NA

for (i in 1:nrow(res_taqman)){
  aux <- kinetics %>% filter(Dilu == res_taqman[i,]$Dilu &
                      Repli == res_taqman[i,]$Repli &
                      probe == res_taqman[i,]$probe &
                      abs(Cycle - res_taqman[i,]$Cq)<1)

res_taqman[i,"RFU_Cq"] <- (aux[2,]$RFU - aux[1,]$RFU)*
  (res_taqman[i,]$Cq-aux[1,]$Cycle)+aux[1,]$RFU
}

# res_taqman <- res_taqman %>%
#   mutate(K = Mct/(RFU_Cq/End.RFU))

Mct_glob <- res_taqman %>%
  group_by(sp) %>%
  summarise(Mct = mean(Mct)) %>% pull(Mct) %>% mean

# K <- res_taqman %>%
#   group_by(sp) %>%
#   summarise(K = mean(K)) %>% pull(K) %>% sum

Computing PCR efficiencies

efficiencies <- res_taqman %>%
  mutate(Lambda = (Mct_glob/(Vmix_taq*copies_ul))^(1/Cq)-1)

ggplot(efficiencies)+
  geom_point(aes(probe, Lambda, col = factor(Dilu)))

efficiencies %>% group_by(probe) %>% summarise(Lambda = mean(Lambda))
## # A tibble: 4 × 2
##   probe Lambda
##   <chr>  <dbl>
## 1 CbeA   0.972
## 2 CbeB   0.947
## 3 Cbp    0.989
## 4 Fex    0.940
efficiencies %>% group_by(sp) %>% summarise(Lambda = mean(Lambda))
## # A tibble: 3 × 2
##   sp    Lambda
##   <chr>  <dbl>
## 1 Cbe    0.960
## 2 Cbp    0.989
## 3 Fex    0.940
#lam = Lambdas = efficiencies:
lam <- efficiencies %>% group_by(sp) %>%
  summarise(efficiency = mean(Lambda),
            sdeff = sd(Lambda))

res_taqman <- res_taqman %>% left_join(lam)
## Joining with `by = join_by(sp)`

Normalise End.RFU for Cbe

We normalise RFU such that End.RFU(CbeA) = End.RFU(CbeB) to compare the 2 probes for Carpinus betulus

#Plot Cq = f(inital quantity)
ggplot(res_taqman %>% filter(!Neg & sp == "Cbe"))+
  geom_point(aes(Dilu, Cq, col = probe))

#Averaging End.RFU and Cq for the two probes
res_cbe_avg <- res_taqman %>% filter(sp == "Cbe" & Dilu < 6) %>%
  group_by(probe, Dilu) %>%
  summarise(end.rfu = mean(End.RFU), Cq = mean(Cq))
## `summarise()` has grouped output by 'probe'. You can override using the
## `.groups` argument.
#Normalising factor
correcCbe <- res_cbe_avg %>% group_by(Dilu, probe) %>%
  summarise(end.rfu = mean(end.rfu)) %>%
  pivot_wider(names_from = probe, values_from = end.rfu) %>%
  mutate(correc = CbeA/CbeB)
## `summarise()` has grouped output by 'Dilu'. You can override using the
## `.groups` argument.
#Normalising kinetics for CbeB
kinetics_cbe <- kinetics %>% filter(sp == "Cbe" & !Neg) %>%
  left_join(correcCbe) %>%
  mutate(RFU_correc = ifelse(probe == "CbeB", RFU*correc, RFU))
## Joining with `by = join_by(Dilu)`
ggplot(kinetics_cbe)+
  geom_line(aes(Cycle, RFU, col = probe, group = Well,
                linetype = factor(Dilu)))

ggplot(kinetics_cbe)+
  geom_line(aes(Cycle, RFU_correc, col = probe, group = Well,
                linetype = factor(Dilu)))

res_cbe <- res_taqman %>% filter(sp == "Cbe" & !Neg)

res_cbe$RFU_Cq <- NA
for (i in 1:nrow(res_cbe)){
  aux <- kinetics %>% filter(Dilu == res_cbe[i,]$Dilu &
                      Repli == res_cbe[i,]$Repli &
                      probe == res_cbe[i,]$probe &
                      abs(Cycle - res_cbe[i,]$Cq)<1)

res_cbe[i,"RFU_Cq"] <- (aux[2,]$RFU - aux[1,]$RFU)*
  (res_cbe[i,]$Cq-aux[1,]$Cycle)+aux[1,]$RFU
}

seuil_Cq <- mean(res_cbe$RFU_Cq)

#Recompute Cq for the normalised RFU
fCq <- function(ampli){
  c2 <- which(ampli >= seuil_Cq)[1]
  c1 <- c2-1
  #linear interpolation
  Cq <- (seuil_Cq - ampli[c1]+(ampli[c2] - ampli[c1])*c1)/
    (ampli[c2] - ampli[c1])
  Cq
}

res_taqman_correc <- NULL
for (p in unique(kinetics_cbe$Well)){
  res_taqman_correc <- rbind(res_taqman_correc,
               data.frame(Well = p,
                          probe = kinetics_cbe[
                            kinetics_cbe$Well == p,]$probe[1],
                          Cq = fCq(kinetics_cbe[
                            kinetics_cbe$Well == p,]$RFU),
                          Cq_correc = fCq(kinetics_cbe[
                            kinetics_cbe$Well == p,]$RFU_correc)))
}

# ggplot(res_taqman_correc)+
#   geom_point(aes(Cq_correc, Cq, col = probe))+
#   geom_abline()

Amplification bias

Comparing the 3 species from the M_U community (amplified within the 13 species pool)

qmet <- df_targetU %>%
  filter(species_name %in% c("Capsella_bursa-pastoris",
                             "Carpinus_betulus",
                             "Fraxinus_excelsior"))

ggplot(qmet)+
  geom_boxplot(aes(species_name,
                   prop))+
  labs(x = "Species", y = "Proportions in M_U")+
  theme(axis.text.x = element_text(angle = 90))

#Renormalized ps for the 3 species from M_U
qmet %>% group_by(species_name) %>%
  summarise(prop = mean(prop)) %>% mutate(prop = prop/sum(prop))
## # A tibble: 3 × 2
##   species_name             prop
##   <ord>                   <dbl>
## 1 Carpinus_betulus        0.325
## 2 Capsella_bursa-pastoris 0.424
## 3 Fraxinus_excelsior      0.251

Table 2: Amplification efficiencies and proportions in M_U

Summarising previous information

resU1 %>% dplyr::select(species_name, prop_expected, prop)
## # A tibble: 13 × 3
##    species_name             prop_expected   prop
##    <ord>                            <dbl>  <dbl>
##  1 Rhododendron_ferrugineum        0.0769 0.105 
##  2 Abies_alba                      0.0769 0.0531
##  3 Carpinus_betulus                0.0769 0.0850
##  4 Geranium_robertianum            0.0769 0.0256
##  5 Capsella_bursa-pastoris         0.0769 0.108 
##  6 Acer_campestre                  0.0769 0.0803
##  7 Fraxinus_excelsior              0.0769 0.0613
##  8 Lonicera_xylosteum              0.0769 0.0550
##  9 Salvia_pratensis                0.0769 0.0261
## 10 Populus_tremula                 0.0769 0.0863
## 11 Lotus_corniculatus              0.0769 0.0785
## 12 Rosa_canina                     0.0769 0.172 
## 13 Briza_media                     0.0769 0.0578
#Efficiencies by Taqman
lam
## # A tibble: 3 × 3
##   sp    efficiency   sdeff
##   <chr>      <dbl>   <dbl>
## 1 Cbe        0.960 0.0141 
## 2 Cbp        0.989 0.00501
## 3 Fex        0.940 0.0108
#Normalized efficiencies
efficiencies %>% group_by(probe) %>%
    summarise(efficiency = mean(Lambda),
              sdeff = sd(Lambda)) %>%
  mutate(efficiency = efficiency/lam$efficiency[3]*0.927, #Fex
         sdeff = sdeff/lam$efficiency[3]*0.927) #Fex
## # A tibble: 4 × 3
##   probe efficiency   sdeff
##   <chr>      <dbl>   <dbl>
## 1 CbeA       0.959 0.00553
## 2 CbeB       0.934 0.00602
## 3 Cbp        0.976 0.00494
## 4 Fex        0.927 0.0107
summary_experiment %>% arrange(RankG) %>%
  dplyr::select(species_name, efficiencies)
## # A tibble: 13 × 2
##    species_name             efficiencies
##    <chr>                           <dbl>
##  1 Briza_media                     0.925
##  2 Rosa_canina                     1    
##  3 Lotus_corniculatus              0.944
##  4 Populus_tremula                 0.950
##  5 Salvia_pratensis                0.867
##  6 Lonicera_xylosteum              0.918
##  7 Fraxinus_excelsior              0.927
##  8 Acer_campestre                  0.947
##  9 Capsella_bursa-pastoris         0.965
## 10 Geranium_robertianum            0.860
## 11 Carpinus_betulus                0.946
## 12 Abies_alba                      0.921
## 13 Rhododendron_ferrugineum        0.962

Table 3: proportions in M_T and M_G

Inferred proportions in M_T and M_G

Proportions

Inferred with the ratio method or the new approach in this paper

resT1
## # A tibble: 13 × 5
##    species_name             prop prop_expected prop_infer_mock prop_infer_Lambda
##    <chr>                   <dbl>         <dbl>           <dbl>             <dbl>
##  1 Briza_media            0.0620        0.0851          0.0842            0.0831
##  2 Rosa_canina            0.262         0.144           0.123             0.123 
##  3 Lotus_corniculatus     0.114         0.0953          0.115             0.116 
##  4 Populus_tremula        0.210         0.168           0.198             0.203 
##  5 Salvia_pratensis       0.0385        0.104           0.118             0.116 
##  6 Lonicera_xylosteum     0.0269        0.0500          0.0385            0.0389
##  7 Fraxinus_excelsior     0.0429        0.0590          0.0568            0.0564
##  8 Acer_campestre         0.0793        0.0913          0.0781            0.0816
##  9 Capsella_bursa-pastor… 0.0322        0.0261          0.0245            0.0247
## 10 Geranium_robertianum   0.0209        0.0558          0.0653            0.0714
## 11 Carpinus_betulus       0.0276        0.0357          0.0258            0.0268
## 12 Abies_alba             0.0145        0.0608          0.0227            0.0213
## 13 Rhododendron_ferrugin… 0.0506        0.0254          0.0371            0.0385
resG1
## # A tibble: 13 × 6
##    species_name    sp       prop prop_expected prop_infer_mock prop_infer_Lambda
##    <chr>           <chr>   <dbl>         <dbl>           <dbl>             <dbl>
##  1 Briza_media     Bme   3.64e-1      0.500          0.536             0.530    
##  2 Rosa_canina     Rca   4.01e-1      0.250          0.200             0.202    
##  3 Lotus_cornicul… Lco   1.48e-1      0.125          0.160             0.162    
##  4 Populus_tremula Ptr   5.20e-2      0.0625         0.0516            0.0558   
##  5 Salvia_pratens… Spr   6.03e-3      0.0313         0.0199            0.0210   
##  6 Lonicera_xylos… Lxy   9.38e-3      0.0156         0.0147            0.0155   
##  7 Fraxinus_excel… Fex   6.84e-3      0.00781        0.00955           0.00953  
##  8 Acer_campestre  Aca   1.60e-3      0.00391        0.00170           0.00175  
##  9 Capsella_bursa… Cbp   1.92e-3      0.00195        0.00151           0.00156  
## 10 Geranium_rober… Gro   1.95e-4      0.000977       0.000643          0.000839 
## 11 Carpinus_betul… Cbe   3.03e-4      0.000488       0.000306          0.000336 
## 12 Abies_alba      Aal   4.50e-5      0.000244       0.0000718         0.0000489
## 13 Rhododendron_f… Rfe   1.45e-4      0.000122       0.000119          0.000144

Proportions with exponential model

Computation of neff for M_U (number of effective exponential cycles) with the truncated exponential model

The K value used is the one selected during the inference in flimo.

summary_experiment <- summary_experiment %>%
  left_join(resU1 %>% dplyr::select(species_name, prop) %>%
                  rename(prop_obsU = prop)) %>%
  left_join(resT1 %>% dplyr::select(species_name, prop) %>%
              rename(prop_obsT = prop)) %>%
  left_join(resG1 %>% dplyr::select(species_name, prop) %>%
              rename(prop_obsG = prop))
## Joining with `by = join_by(species_name)`
## Joining with `by = join_by(species_name)`
## Joining with `by = join_by(species_name)`
#mg stands for Geometric Mean

mg_lam_s_p1 <- exp(mean(log(summary_experiment$efficiencies+1))) #Mean of Lambda_s + 1

#M_U

mg_psU <- exp(mean(log(summary_experiment$prop_obsU))) #Mean of relative abundances ps

mg_m0sU <- exp(mean(log(VDNA_metab*summary_experiment$molecules_U))) #molecules/ug ; Mean of initial numbers of molecules m0s

neffU <- (log(mg_psU)+log(K)-log(mg_m0sU))/log(mg_lam_s_p1)

#M_T

mg_psT <- exp(mean(log(summary_experiment$prop_obsT)))
mg_m0sT <- exp(mean(log(VDNA_metab*summary_experiment$molecules_T))) #molecules/ug
neffT <- (log(mg_psT)+log(K)-log(mg_m0sT))/log(mg_lam_s_p1)

#M_G

mg_psG <- exp(mean(log(summary_experiment$prop_obsG)))
mg_m0sG <- exp(mean(log(VDNA_metab*summary_experiment$molecules_G))) #molecules/ug
neffG <- (log(mg_psG)+log(K)-log(mg_m0sG))/log(mg_lam_s_p1)

Cycles <- 1:40
res_exp <- NULL
for (cyc in Cycles){
    m0s <- summary_experiment$prop_obsU/
      (1+summary_experiment$efficiencies)^cyc
  psU <- m0s/sum(m0s)
  
      m0s <- summary_experiment$prop_obsT/
        (1+summary_experiment$efficiencies)^cyc
  psT <- m0s/sum(m0s)
  
      m0s <- summary_experiment$prop_obsG/
        (1+summary_experiment$efficiencies)^cyc
  psG <- m0s/sum(m0s)
  
  res_exp <- rbind(res_exp, data.frame(sp = summary_experiment$sp,
                                       cycle = cyc,
                                       psU = psU, psT = psT, psG = psG))

}

# res_exp %>% View

ggplot(res_exp)+
  geom_line(aes(cycle, psU, col = sp))+
  geom_hline(yintercept = 1/13)+
  geom_vline(xintercept = neffU)

ggplot(res_exp)+
  geom_line(aes(cycle, psT, col = sp))

ggplot(res_exp)+
  geom_line(aes(cycle, psG, col = sp))

Estimation error: RMSE values

RMSE (root mean squared error) to compare observed/corrected vs expected proportions Either absolute or relative error

AbsErr <- function(p_observed, p_expected){
  sqrt(sum((p_observed-p_expected)^2)/length(p_observed))
}

RelErr <- function(p_observed, p_expected){
  sqrt(sum(((p_observed-p_expected)/p_expected)^2)/length(p_observed))
}

#U
summary(resU1$prop)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.02565 0.05499 0.07852 0.07646 0.08630 0.17207
AbsErr(resU1$prop, resU1$prop_expected)
## [1] 0.03702942
RelErr(resU1$prop, resU1$prop_expected)
## [1] 0.4813825
#T
summary(resT1$prop)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.01454 0.02758 0.04289 0.07551 0.07932 0.26184
AbsErr(resT1$prop, resT1$prop_expected)
## [1] 0.04465347
AbsErr(resT1$prop_infer_mock, resT1$prop_expected)
## [1] 0.01741129
AbsErr(resT1$prop_infer_Lambda, resT1$prop_expected)
## [1] 0.01855092
RelErr(resT1$prop, resT1$prop_expected)
## [1] 0.5269626
RelErr(resT1$prop_infer_mock, resT1$prop_expected)
## [1] 0.2631424
RelErr(resT1$prop_infer_Lambda, resT1$prop_expected)
## [1] 0.2797864
#RMSE of T if it is considered as a uniform community
AbsErr(resT1$prop, resU1$prop_expected)
## [1] 0.07382099
RelErr(resT1$prop, resU1$prop_expected)
## [1] 0.9596729
#G
summary(resG1$prop)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0000450 0.0003027 0.0060311 0.0762360 0.0519728 0.4009659
AbsErr(resG1$prop, resG1$prop_expected)
## [1] 0.05724279
AbsErr(resG1$prop_infer_mock, resG1$prop_expected)
## [1] 0.02026261
AbsErr(resG1$prop_infer_Lambda, resG1$prop_expected)
## [1] 0.01912173
RelErr(resG1$prop, resG1$prop_expected)
## [1] 0.4930014
RelErr(resG1$prop_infer_mock, resG1$prop_expected)
## [1] 0.3358845
RelErr(resG1$prop_infer_Lambda, resG1$prop_expected)
## [1] 0.3319039

Inference at neffU : if the amplification is exponential and is stopped after neffU cycles, the estimated proportions are psU, psT and psG (below). Associated RMSE.

m0s <- summary_experiment$prop_obsU/
  (1+summary_experiment$efficiencies)^neffU
psU <- m0s/sum(m0s)
  
m0s <- summary_experiment$prop_obsT/
  (1+summary_experiment$efficiencies)^neffU
psT <- m0s/sum(m0s)
  
m0s <- summary_experiment$prop_obsG/
  (1+summary_experiment$efficiencies)^neffU
psG <- m0s/sum(m0s)

AbsErr(psU, summary_experiment$propU)
## [1] 0.002992818
RelErr(psU, summary_experiment$propU)
## [1] 0.03890663
AbsErr(psT, summary_experiment$propT)
## [1] 0.01816503
RelErr(psT, summary_experiment$propT)
## [1] 0.2911542
AbsErr(psG, summary_experiment$propG)
## [1] 0.01765036
RelErr(psG, summary_experiment$propG)
## [1] 0.3270823

###Associated biodiversity measures: Hill numbers

Computing the Hill numbers to evaluate the influence of correction on ecological conclusions.

#Hill
D <- function(ps, q){
  ps <- ps/sum(ps)
  ps <- ps[ps>0]
  ifelse(q == 1, exp(-sum(ps*log(ps))), sum(ps^q)^(1/(1-q)))
}

#Community T

D(resT1$prop_expected, 1)
## [1] 11.26119
D(resT1$prop, 1)
## [1] 8.868797
D(resT1$prop_infer_mock, 1)
## [1] 10.64034
D(resT1$prop_infer_Lambda, 1)
## [1] 10.63821
D(resT1$prop_expected, 2)
## [1] 10.02984
D(resT1$prop, 2)
## [1] 6.647851
D(resT1$prop_infer_mock, 2)
## [1] 9.125834
D(resT1$prop_infer_Lambda, 2)
## [1] 9.117803
#Community G

D(resG1$prop_expected, 1)
## [1] 3.995114
D(resG1$prop, 1)
## [1] 3.70669
D(resG1$prop_infer_mock, 1)
## [1] 3.733344
D(resG1$prop_infer_Lambda, 1)
## [1] 3.808069
D(resG1$prop_expected, 2)
## [1] 2.999268
D(resG1$prop, 2)
## [1] 3.089144
D(resG1$prop_infer_mock, 2)
## [1] 2.784234
D(resG1$prop_infer_Lambda, 2)
## [1] 2.845948

Plot proportions for G

ginferG1 <- ggplot(df_targetG)+
  geom_boxplot(aes(factor(sp, levels = order_short,
                          ordered = T),
                   prop))+
    geom_point(data = summary_experiment,
             aes(sp, propG, fill = "Expected"), shape = 23,
             size = 2)+
    geom_point(data = resG1, aes(sp, prop_infer_Lambda, fill = "Inferred\nwith model"),
             shape = 23, size = 2)+
  labs(y = "Proportions", x = "Species",
       fill = "", col = "")+
  scale_fill_manual(values = colp)+
  ggtitle("Abundances in the Geometric community (G)")+
  theme(plot.title = element_text(hjust = 0.5))+
  scale_y_log10()+
  scale_x_discrete(labels= function(x) highlight(x, "Cb|Fe"))+
  theme(axis.text.x=element_markdown())

ginferG1

# ggsave("Figures/metabar_results_corrected_G.png", units = "mm",
#        width = 150, height = 90, dpi = 600)

Figure 5: Simulations: impact of efficiency variations

Simulated amplification to quantify the effect of efficiency differences.

Preparing simulations

pcr_sim : Function to simulate PCR with multiple species

pcr_sim <- function(m0=1,
                    lambda=rep(1, length(m0)),
                    K=1e13,
                    cycles = 40,
                    satu = c("log", "meca"), #meca not introduced in paper
                    c = 0.9, #for meca
                    e = 1.1, #for meca
                    delta = 1) { #for initial overdispersion
  param <- data.frame(m0 = m0,
                      lambda = lambda)
  nspecies <- nrow(param)
  
  #Number of molecules at each cycle for each species
  kinetics <- matrix(0, nrow = cycles+1, ncol = nspecies)
  if (delta <= 1){
    for (m in 1:nspecies){
      kinetics[1,m] <- rpois(1, m0[m])
    }
  } else {
    for (m in 1:nspecies){
      kinetics[1,m] <- rnbinom(1, m0[m]/delta, 1/delta)
    }
  }
  for (cyc in 2:(1+cycles)) {
    charge <- sum(kinetics[cyc-1,]-kinetics[1,])/K
    for (m in 1:nspecies) {
      lam <- lambda[m]
      mc <- kinetics[cyc-1,m] #Molecules of s at previous cycle
      if(satu[1] == "meca"){ #This model is not introduced in the paper
        gamma <- function(x){
          2/(1+sqrt(1-4*c^2*x*(1-x)))
        }
        lam <- lam*c*gamma(charge)*(1-charge)*(e-charge)
      }
      else{ #logistic
        lam <- max(0, lam*(1-charge))
      }
      #Number of created molecules
      nb <- rbinom(1, mc, lam)
      kinetics[cyc,m] <- kinetics[cyc-1,m]+nb
    }
  }
  return(kinetics)
}

nspecies <- 2
ncycles <- 40
M0 = rep(2.5e5/nspecies, nspecies) #Total number of molecules: 2.5e5

#Tested values
lam1 <- 1 #Efficiency of species 1
Lam2 <- seq(0.8, 1, by = 0.001) #Different efficiencies of species 2

res <- NULL
for (lam2 in Lam2){
  p <- pcr_sim(m0=M0, lambda=c(lam1, lam2), K=K,
                cycles = ncycles)
  res <- rbind(res, p[ncycles+1,])
}

#Proportions of the 2 species
prop2 <- df_targetU %>% group_by(sp) %>%
  summarise(propU = median(prop)) %>% ungroup() %>%
  mutate(propU = propU/(propU+propU[11])) %>% #11 = Rca
  left_join(summary_experiment %>% dplyr::select(sp, efficiencies)) %>%
  mutate(deff = 1-efficiencies) %>% filter(sp != "Rc")
## Joining with `by = join_by(sp)`

Figure 5:

#Labels positions
prop2$labelx <- prop2$deff - 0.0021
prop2[prop2$sp == "Rfe","labelx"] <- prop2[prop2$sp == "Rfe","labelx"]+
  0.0042
prop2[prop2$sp == "Lco","labelx"] <- prop2[prop2$sp == "Lco","labelx"]+
  0.0042
prop2[prop2$sp == "Lxy","labelx"] <- prop2[prop2$sp == "Lxy","labelx"]+
  0.0042
prop2[prop2$sp == "Aal","labelx"] <- prop2[prop2$sp == "Aal","labelx"]+
  0.0038
prop2[prop2$sp == "Bme","labelx"] <- prop2[prop2$sp == "Bme","labelx"]+
  0.0037
prop2[prop2$sp == "Cbe","labelx"] <- prop2[prop2$sp == "Cbe","labelx"]+
  0.0021


ggplot()+
    geom_vline(data = prop2 %>% filter(sp != "Rca"), aes(xintercept = deff),
             col = "grey80")+
    geom_line(aes(1-Lam2/lam1, res[,1]/(rowSums(res)),
                  col = "Rosa canina"))+
  geom_line(aes(1-Lam2/lam1, res[,2]/(rowSums(res)), col = "other"))+
  geom_hline(yintercept = 0.5, linetype = "dashed")+
  # ggtitle("Simulated mock communities of two species with equal starting quantities")+
  # theme(plot.title = element_text(hjust = 0.5))+
  geom_point(data = prop2 %>% filter(sp != "Rca"),
             aes(deff, propU, col = "other"))+
  geom_point(data = prop2 %>% filter(sp != "Rca"),
             aes(deff, 1-propU, col = "Rosa canina"))+
  geom_text(aes(0.135, 0.73, label = "Rosa canina",
                col = "Rosa canina"),
            size = 4)+
  geom_text(aes(0.135, 0.27, label = "Second species", col = "other"),
            size = 4)+
  geom_text(data = prop2 %>%
              filter(sp != "Rca") %>% arrange(efficiencies),
              aes(labelx, 1, label = sp),
            col = "grey50", size = 3, angle = 90)+
    geom_text(data = prop2 %>% filter(sp %in% c("Cbe", "Cbp", "Fex")) %>%
                arrange(efficiencies),
              aes(labelx, 1, label = sp),
            col = "grey25", size = 3, angle = 90)+
  labs(x = "Relative decrease of PCR efficiency compared to Rosa canina",
       y = "Final relative abundances", col = "Species")+
  coord_cartesian(xlim = c(0.006, 0.15), ylim = c(0,1))+
  theme(legend.position = "none")+
  scale_colour_brewer(palette = "Set1")

  #   geom_text_repel(
  #   data = prop2 %>% filter(sp != "Rca"),
  #   aes(deff, propU, label = sp),
  #   size = 4,
  #   min.segment.length = unit(0, 'lines'))

# rep(c(1,0.95), 6)

# ggsave(filename = "Figures/simu_efficiencies.pdf", units = "mm",
#        height = 100, width = 150, device = "pdf", dpi = 600)

Uncertainty on K

efficiencies_inferK0 <- cbind(1,
                            read_csv("data/efficiencies_U_K0.csv"))
## Rows: 10 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (12): Column1, Column2, Column3, Column4, Column5, Column6, Column7, Col...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames(efficiencies_inferK0) <- colnames(readsU)
efficiencies_inferK0 <- efficiencies_inferK0 %>%
              as_tibble() %>%
              pivot_longer(1:13, names_to = "species_name",
                           values_to = "efficiencies")
efficiencies_inferK0$K <- K*10

efficiencies_inferK2 <- cbind(1,
                            read_csv("data/efficiencies_U_K2.csv"))
## Rows: 10 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (12): Column1, Column2, Column3, Column4, Column5, Column6, Column7, Col...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames(efficiencies_inferK2) <- colnames(readsU)
efficiencies_inferK2 <- efficiencies_inferK2 %>%
              as_tibble() %>%
              pivot_longer(1:13, names_to = "species_name",
                           values_to = "efficiencies")
efficiencies_inferK2$K <- K/10

efficiencies_inferK3 <- cbind(1,
                            read_csv("data/efficiencies_U_K3.csv"))
## Rows: 10 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (12): Column1, Column2, Column3, Column4, Column5, Column6, Column7, Col...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames(efficiencies_inferK3) <- colnames(readsU)
efficiencies_inferK3 <- efficiencies_inferK3 %>%
              as_tibble() %>%
              pivot_longer(1:13, names_to = "species_name",
                           values_to = "efficiencies")
efficiencies_inferK3$K <- K/100

efficiencies_inferK4 <- cbind(1,
                            read_csv("data/efficiencies_U_K4.csv"))
## Rows: 10 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (12): Column1, Column2, Column3, Column4, Column5, Column6, Column7, Col...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames(efficiencies_inferK4) <- colnames(readsU)
efficiencies_inferK4 <- efficiencies_inferK4 %>%
              as_tibble() %>%
              pivot_longer(1:13, names_to = "species_name",
                           values_to = "efficiencies")
efficiencies_inferK4$K <- K/1000

efficiencies_inferK1 <- efficiencies_infer %>%
              as_tibble() %>%
              pivot_longer(1:13, names_to = "species_name",
                           values_to = "efficiencies")
efficiencies_inferK1$K <- K

eff_infer_sf1 <- rbind(efficiencies_inferK0,
                       efficiencies_inferK1,
                       efficiencies_inferK2,
                       efficiencies_inferK3,
                       efficiencies_inferK4) %>%
  left_join(summary_experiment[,2:3])
## Joining with `by = join_by(species_name)`

Supplementary Figure 1

Lambda_s = f(K)

eff_infer_sf1 <- eff_infer_sf1 %>%
         filter(species_name != "Rosa_canina" & K > 1e11)

ggplot(eff_infer_sf1 %>%
         group_by(sp, K) %>%
         summarise(efficiencies = mean(efficiencies)))+
  geom_point(aes(K, efficiencies, col = sp, group = sp))+
  geom_line(aes(K, efficiencies, col = sp, group = sp),
            linetype = "dashed")+
  # geom_segment(aes(x = K/100,
  #                  xend = max(eff_infer_sf1$K),
  #                  y = 1, yend = 1,
  #                  col = "Rca"))+
  geom_vline(xintercept = K, linetype = "dashed")+
  scale_x_log10()+
  labs(y = TeX("Efficiencies $\\Lambda_s$"), col = "Species")+
    theme(
        legend.title.align=0.5,
        legend.text.align = 0.5)
## `summarise()` has grouped output by 'sp'. You can override using the `.groups`
## argument.

ggsave(filename = "Figures/Lambda_fctK.pdf",
       dpi = 600, units = "mm",
       width = 150, height = 100, device = "pdf")
eff_infer_sf1 <- eff_infer_sf1 %>% left_join(
  eff_infer_sf1 %>% filter(sp == "Gro") %>% dplyr::select(K, efficiencies) %>%
  rename(lambda_min = efficiencies))
## Joining with `by = join_by(K)`
ggplot(eff_infer_sf1 %>%
         filter(species_name != "Rosa_canina" & K > 1e11) %>%
         group_by(sp, K) %>%
         summarise(efficiencies = mean(efficiencies), lambda_min = lambda_min[1]))+
  geom_point(aes(K, efficiencies/lambda_min, col = sp, group = sp))+
  geom_line(aes(K, efficiencies/lambda_min, col = sp, group = sp),
            linetype = "dashed")+
  geom_vline(xintercept = K, linetype = "dashed")+
  scale_x_log10()+
  labs(y = TeX("Efficiencies $\\Lambda_s / \\Lambda_{Gro}$"), col = "Species")+
    theme(
        legend.title.align=0.5,
        legend.text.align = 0.5)
## `summarise()` has grouped output by 'sp'. You can override using the `.groups`
## argument.

# ggsave(filename = "Figures/Lambda_Lmin_fctK.pdf",
#        dpi = 600, units = "mm",
#        width = 150, height = 100, device = "pdf")

Supplementary Figure 2

inferT0 <- read.csv("data/prop_inferT_K0.csv")
colnames(inferT0) <- colnames(readsT)
inferT0 <- inferT0 %>%
              pivot_longer(everything(),
                           names_to = "species_name",
                           values_to = "prop_infer_Lambda") %>%
  arrange(-dplyr::row_number())
inferT0$K <- K*10

inferT1 <- read.csv("data/prop_inferT_K1.csv")
colnames(inferT1) <- colnames(readsT)
inferT1 <- inferT1 %>%
              pivot_longer(everything(),
                           names_to = "species_name",
                           values_to = "prop_infer_Lambda") %>%
  arrange(-dplyr::row_number())
inferT1$K <- K

inferT2 <- read.csv("data/prop_inferT_K2.csv")
colnames(inferT2) <- colnames(readsT)
inferT2 <- inferT2 %>%
              pivot_longer(everything(),
                           names_to = "species_name",
                           values_to = "prop_infer_Lambda") %>%
  arrange(-dplyr::row_number())
inferT2$K <- K/10

inferT3 <- read.csv("data/prop_inferT_K3.csv")
colnames(inferT3) <- colnames(readsT)
inferT3 <- inferT3 %>%
              pivot_longer(everything(),
                           names_to = "species_name",
                           values_to = "prop_infer_Lambda") %>%
  arrange(-dplyr::row_number())
inferT3$K <- K/100

inferT4 <- read.csv("data/prop_inferT_K4.csv")
colnames(inferT4) <- colnames(readsT)
inferT4 <- inferT4 %>%
              pivot_longer(everything(),
                           names_to = "species_name",
                           values_to = "prop_infer_Lambda") %>%
  arrange(-dplyr::row_number())
inferT4$K <- K/1000

inferTK <- rbind(inferT0, inferT1, inferT2, inferT3, inferT4)
inferTK <- inferTK %>% left_join(summary_experiment[,2:3])
## Joining with `by = join_by(species_name)`
ggplot(inferTK %>% filter(K > 1e11))+geom_point(aes(K, prop_infer_Lambda, col = sp))+
  scale_x_log10()+
  geom_vline(xintercept = K)

inferG0 <- read.csv("data/prop_inferG_K0.csv")
colnames(inferG0) <- colnames(readsG)
inferG0 <- inferG0 %>%
              pivot_longer(everything(),
                           names_to = "species_name",
                           values_to = "prop_infer_Lambda") %>%
  arrange(-dplyr::row_number())
inferG0$K <- K*10

inferG1 <- read.csv("data/prop_inferG_K1.csv")
colnames(inferG1) <- colnames(readsG)
inferG1 <- inferG1 %>%
              pivot_longer(everything(),
                           names_to = "species_name",
                           values_to = "prop_infer_Lambda") %>%
  arrange(-dplyr::row_number())
inferG1$K <- K

inferG2 <- read.csv("data/prop_inferG_K2.csv")
colnames(inferG2) <- colnames(readsG)
inferG2 <- inferG2 %>%
              pivot_longer(everything(),
                           names_to = "species_name",
                           values_to = "prop_infer_Lambda") %>%
  arrange(-dplyr::row_number())
inferG2$K <- K/10

inferG3 <- read.csv("data/prop_inferG_K3.csv")
colnames(inferG3) <- colnames(readsG)
inferG3 <- inferG3 %>%
              pivot_longer(everything(),
                           names_to = "species_name",
                           values_to = "prop_infer_Lambda") %>%
  arrange(-dplyr::row_number())
inferG3$K <- K/100

inferG4 <- read.csv("data/prop_inferG_K4.csv")
colnames(inferG4) <- colnames(readsG)
inferG4 <- inferG4 %>%
              pivot_longer(everything(),
                           names_to = "species_name",
                           values_to = "prop_infer_Lambda") %>%
  arrange(-dplyr::row_number())
inferG4$K <- K/1000

inferGK <- rbind(inferG0, inferG1, inferG2, inferG3, inferG4)
inferGK <- inferGK %>% left_join(summary_experiment[,2:3])
## Joining with `by = join_by(species_name)`
ggplot(inferGK %>% filter(K > 1e11))+
  geom_point(aes(K, prop_infer_Lambda, col = sp))+
  scale_x_log10()+scale_y_log10()+
  geom_vline(xintercept = K)

Supplementary Figure 2

rmse0 <- read.csv("data/at_rt_ag_rg_K0.csv")
rmse1 <- read.csv("data/at_rt_ag_rg_K1.csv")
rmse2 <- read.csv("data/at_rt_ag_rg_K2.csv")
rmse3 <- read.csv("data/at_rt_ag_rg_K3.csv")
rmse4 <- read.csv("data/at_rt_ag_rg_K4.csv")

res_rmse <- cbind(t(cbind(rmse0, rmse1, rmse2, rmse3, rmse4)),
      c(K*10, K, K/10, K/100, K/1000))
colnames(res_rmse) <- c("AbsT", "RelT", "AbsG", "RelG", "K")
res_rmse[,1] <- res_rmse[,1]/res_rmse[2,1]
res_rmse[,2] <- res_rmse[,2]/res_rmse[2,2]
res_rmse[,3] <- res_rmse[,3]/res_rmse[2,3]
res_rmse[,4] <- res_rmse[,4]/res_rmse[2,4]
res_rmse
##              AbsT      RelT      AbsG      RelG            K
## Column1 0.9951259 0.9988668 0.9928391 1.0022559 1.204428e+14
## Column1 1.0000000 1.0000000 1.0000000 1.0000000 1.204428e+13
## Column1 0.9995183 1.0013051 1.0054587 1.0099123 1.204428e+12
## Column1 1.0016628 0.9995328 0.9832693 0.9989560 1.204428e+11
## Column1 0.9927982 0.9977561 1.0281356 0.9995787 1.204428e+10
ggplot(as.tibble(res_rmse) %>% filter(K > 1e11))+
  geom_point(aes(K, AbsT, col = "Abs M_T"))+
  geom_point(aes(K, AbsG, col = "Abs M_G"))+
  geom_point(aes(K, RelT, col = "Rel M_T"))+
  geom_point(aes(K, RelG, col = "Rel M_G"))+
  scale_x_log10()+
  geom_vline(xintercept = K)+
  labs(x = "K", y = "RMSE / RMSE(true K)", col = "Error type")

# ggsave(filename = "Figures/RMSE_fctK.pdf",
#        dpi = 600, units = "mm",
#        width = 150, height = 100, device = "pdf")

Supplementary Table 1: C-values

Mass of one genome for the 13 species.

summary_experiment$Cvalue <- c(0.46, 0.45, 1.03, 0.84, 0.70, 17.29,
                               1.02, 6.35, 1.40, 0.40, 1.76, 0.74, 0.87)

summary_experiment <- summary_experiment %>%
  mutate(copies_genome = Molecules_ng*1e9*Cvalue*1e-12)

summary_experiment
## # A tibble: 13 × 21
##    Sample species_name         sp    Molecules_ng CDNA_sample Volume_equi cDNA_U
##     <dbl> <chr>                <chr>        <dbl>       <dbl>       <dbl>  <dbl>
##  1      4 Salvia_pratensis     Spr        152880        24.4         1.62 0.0624
##  2      6 Populus_tremula      Ptr        248096        31.4         1    0.0385
##  3     10 Carpinus_betulus     Cbe         52667.        9.14        4.71 0.181 
##  4     12 Fraxinus_excelsior   Fex         87040        22.4         2.85 0.110 
##  5     16 Lonicera_xylosteum   Lxy         73740        45.8         3.36 0.129 
##  6     18 Abies_alba           Aal         89653.        3.58        2.77 0.106 
##  7     20 Acer_campestre       Aca        134683.       12.2         1.84 0.0708
##  8     22 Briza_media          Bme        125573.      183           1.98 0.0760
##  9     24 Rosa_canina          Rca        211640        50.8         1.17 0.0451
## 10     26 Capsella_bursa-past… Cbp         38533.       38.8         6.44 0.248 
## 11     28 Geranium_robertianum Gro         82293.       15           3.01 0.116 
## 12     30 Rhododendron_ferrug… Rfe         37483.        3.9         6.62 0.255 
## 13     32 Lotus_corniculatus   Lco        140480        65.2         1.77 0.0679
## # ℹ 14 more variables: molecules_U <dbl>, propU <dbl>, RankG <dbl>,
## #   molecules_T <dbl>, propT <dbl>, cDNA_G <dbl>, molecules_G <dbl>,
## #   propG <dbl>, efficiencies <dbl>, prop_obsU <dbl>, prop_obsT <dbl>,
## #   prop_obsG <dbl>, Cvalue <dbl>, copies_genome <dbl>

Supplementary Table 2: initial communities composition

summary_experiment %>% arrange(RankG)
## # A tibble: 13 × 21
##    Sample species_name         sp    Molecules_ng CDNA_sample Volume_equi cDNA_U
##     <dbl> <chr>                <chr>        <dbl>       <dbl>       <dbl>  <dbl>
##  1     22 Briza_media          Bme        125573.      183           1.98 0.0760
##  2     24 Rosa_canina          Rca        211640        50.8         1.17 0.0451
##  3     32 Lotus_corniculatus   Lco        140480        65.2         1.77 0.0679
##  4      6 Populus_tremula      Ptr        248096        31.4         1    0.0385
##  5      4 Salvia_pratensis     Spr        152880        24.4         1.62 0.0624
##  6     16 Lonicera_xylosteum   Lxy         73740        45.8         3.36 0.129 
##  7     12 Fraxinus_excelsior   Fex         87040        22.4         2.85 0.110 
##  8     20 Acer_campestre       Aca        134683.       12.2         1.84 0.0708
##  9     26 Capsella_bursa-past… Cbp         38533.       38.8         6.44 0.248 
## 10     28 Geranium_robertianum Gro         82293.       15           3.01 0.116 
## 11     10 Carpinus_betulus     Cbe         52667.        9.14        4.71 0.181 
## 12     18 Abies_alba           Aal         89653.        3.58        2.77 0.106 
## 13     30 Rhododendron_ferrug… Rfe         37483.        3.9         6.62 0.255 
## # ℹ 14 more variables: molecules_U <dbl>, propU <dbl>, RankG <dbl>,
## #   molecules_T <dbl>, propT <dbl>, cDNA_G <dbl>, molecules_G <dbl>,
## #   propG <dbl>, efficiencies <dbl>, prop_obsU <dbl>, prop_obsT <dbl>,
## #   prop_obsG <dbl>, Cvalue <dbl>, copies_genome <dbl>

Simulations of mock communities

100 simulated communities for each of the 3 M0tot values with 13 species

###Varying initial quantity, final abundances

The initial number of molecules varies in the M_U community:

#Different values of total number of molecules at the beginning
M0tot <- 2.5e5*c(1e-1, 1, 1e1)

nsim <- 100

compo <- NULL

for (sim in 1:nsim){
  for (m0tot in M0tot){
    #Composition of this community
    MU_varM0 <- pcr_sim(m0=rep(m0tot/nrow(summary_experiment),
                           nrow(summary_experiment)),
                    lambda=summary_experiment$efficiencies,
                    K=K,
                cycles = ncycles)
    compo <- rbind(compo, data.frame(sim = sim,
                                   M0tot = m0tot,
                                   sp = summary_experiment$sp,
                                   ps = MU_varM0[nrow(MU_varM0),]/
                                     sum(MU_varM0[nrow(MU_varM0),])))
  }
}

ggplot(compo %>% filter(sp %in% c("Rca", "Gro")))+
  geom_boxplot(aes(factor(M0tot), ps, col = sp))+
  geom_point(data = data.frame(),
             aes(factor(M0tot[2]),
                 (summary_experiment %>%
                    filter(sp %in% c("Rca", "Gro")) %>% pull(prop_obsU))),
  shape = 23, fill = "gold")+
  scale_color_brewer(palette = "Set1")

#adjust positions for real proportions...

compo %>% filter(sp %in% c("Rca", "Gro")) %>% group_by(M0tot, sp) %>%
  summarise(mps = mean(ps))
## `summarise()` has grouped output by 'M0tot'. You can override using the
## `.groups` argument.
## # A tibble: 6 × 3
## # Groups:   M0tot [3]
##     M0tot sp       mps
##     <dbl> <chr>  <dbl>
## 1   25000 Gro   0.0206
## 2   25000 Rca   0.187 
## 3  250000 Gro   0.0243
## 4  250000 Rca   0.172 
## 5 2500000 Gro   0.0285
## 6 2500000 Rca   0.157

Varying initial quantities, inferred abundances

Data are generated with the logistic model. Corrections are brought with our approach and with the ratio method (assuming exponential model)

Simulate communities

compo_ref <- compo %>% filter(M0tot == 2.5e5) %>% group_by(sp) %>%
  summarise(med_ps = median(ps)) %>%
  mutate(correc = med_ps/(1/nrow(summary_experiment)))
#observed ps/expected ps
#with expected ps = 1/13

compo <- compo %>% left_join(summary_experiment[,c("sp", "species_name")]) %>%
  left_join(compo_ref) %>%
  mutate(ps_ratio = ps/correc)
## Joining with `by = join_by(sp)`
## Joining with `by = join_by(sp)`
compo <- compo %>% left_join(
    compo %>% group_by(sim, M0tot) %>%
      summarise(sum_ps_ratio = sum(ps_ratio)) ) %>%
  mutate(ps_ratio = ps_ratio/sum_ps_ratio) %>%
  dplyr::select(-sum_ps_ratio)
## `summarise()` has grouped output by 'sim'. You can override using the `.groups`
## argument.
## Joining with `by = join_by(sim, M0tot)`

Export to Julia (for inference)

for(i in 1:length(M0tot)){
  m0tot <- M0tot[i]
  mat <- compo %>% filter(M0tot == m0tot) %>%
    dplyr::select(sim, species_name, ps) %>%
    pivot_wider(names_from = species_name, values_from = ps) %>%
    dplyr::select(colnames(readsU)) %>% as.matrix()
  # write.csv(mat, paste0("data/sim100_m0tot_", i, ".csv"),row.names = F)
}

Inference is performed in file inference_simulated_communities.jl

Inference results computed in Julia

res_commu1 <- read.csv("data/res_commu1.csv")
res_commu2 <- read.csv("data/res_commu2.csv")
res_commu3 <- read.csv("data/res_commu3.csv")

colnames(res_commu1) <- colnames(readsU)
colnames(res_commu2) <- colnames(readsU)
colnames(res_commu3) <- colnames(readsU)

res_commu1 <- res_commu1 %>% as_tibble() %>%
  mutate(sim = 1:100, M0tot = 2.5e4) %>%
  pivot_longer(1:13, names_to = "species_name", values_to = "ps_log") %>%
  left_join(summary_experiment %>% dplyr::select(sp, species_name))
## Joining with `by = join_by(species_name)`
res_commu2 <- res_commu2 %>% as_tibble() %>%
  mutate(sim = 1:100, M0tot = 2.5e5) %>%
  pivot_longer(1:13, names_to = "species_name", values_to = "ps_log") %>%
  left_join(summary_experiment %>% dplyr::select(sp, species_name))
## Joining with `by = join_by(species_name)`
res_commu3 <- res_commu3 %>% as_tibble() %>%
  mutate(sim = 1:100, M0tot = 2.5e6) %>%
  pivot_longer(1:13, names_to = "species_name", values_to = "ps_log") %>%
  left_join(summary_experiment %>% dplyr::select(sp, species_name))
## Joining with `by = join_by(species_name)`
res_infer_log <- rbind(res_commu1, res_commu2, res_commu3)

Supplementary Figure 3

to compare the two approaches (ratio method and efficiencies inference)

ggplot(compo)+
  geom_boxplot(aes(factor(sp, levels = order_short, ordered = T),
                   ps_ratio,
                   group = sp, col = "Ratio"), alpha = 0.1)+
  geom_boxplot(data = res_infer_log, aes(sp, ps_log,
                   group = sp, col = "Logistic model"), alpha = 0.1)+
  geom_point(data = summary_experiment,
             aes(sp, 1/nrow(summary_experiment),
                 fill = "Expected\nwithout\nPCR bias"),
             shape = 23,
             size = 3)+
  facet_wrap(M0tot~., ncol = 1)+
  # scale_color_manual(values = colp)+
  scale_fill_manual(values = colp)+
  labs(x = "Species", y = "Corrected reads proportions",
       fill = "", col = "Correction")+
  scale_color_brewer(palette = "Set1")+
  ggtitle(TeX("$M_0^{{total}}$")) +
  theme(legend.title = element_text(hjust = 0.5),
    plot.title = element_text(hjust = 0.5),
    axis.text.x=element_markdown())+
  scale_x_discrete(labels = function(x) highlight(x, "Cb|Fe"))

# ggsave(filename = "Figures/simu_exp_log.pdf",
#        dpi = 600, units = "mm",
#        width = 150, height = 180, device = "pdf")